#required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
#machine learning models libraries
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import RidgeCV
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
#preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error as mae
#data related libraries
from datetime import date
import datetime
# Load the five competition CSVs into dataframes.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
stores = pd.read_csv("stores.csv")
features = pd.read_csv("features.csv")
sample_submission = pd.read_csv("sampleSubmission.csv")

# Preview the first rows of every table under its banner.
for banner, frame in (
    ("-----------------------------FEATURES---------------------", features),
    ("-----------------------------STORES-----------------------", stores),
    ("-----------------------------TRAIN------------------------", train),
    ("-----------------------------TEST-------------------------", test),
    ("-------------------------SUBMISSION SAMPLE----------------", sample_submission),
):
    print(banner + "\n")
    print(frame.head())
-----------------------------FEATURES---------------------
Store Date Temperature Fuel_Price MarkDown1 MarkDown2 \
0 1 2010-02-05 42.31 2.572 NaN NaN
1 1 2010-02-12 38.51 2.548 NaN NaN
2 1 2010-02-19 39.93 2.514 NaN NaN
3 1 2010-02-26 46.63 2.561 NaN NaN
4 1 2010-03-05 46.50 2.625 NaN NaN
MarkDown3 MarkDown4 MarkDown5 CPI Unemployment IsHoliday
0 NaN NaN NaN 211.096358 8.106 False
1 NaN NaN NaN 211.242170 8.106 True
2 NaN NaN NaN 211.289143 8.106 False
3 NaN NaN NaN 211.319643 8.106 False
4 NaN NaN NaN 211.350143 8.106 False
-----------------------------STORES-----------------------
Store Type Size
0 1 A 151315
1 2 A 202307
2 3 B 37392
3 4 A 205863
4 5 B 34875
-----------------------------TRAIN------------------------
Store Dept Date Weekly_Sales IsHoliday
0 1 1 2010-02-05 24924.50 False
1 1 1 2010-02-12 46039.49 True
2 1 1 2010-02-19 41595.55 False
3 1 1 2010-02-26 19403.54 False
4 1 1 2010-03-05 21827.90 False
-----------------------------TEST-------------------------
Store Dept Date IsHoliday
0 1 1 2012-11-02 False
1 1 1 2012-11-09 False
2 1 1 2012-11-16 False
3 1 1 2012-11-23 True
4 1 1 2012-11-30 False
-------------------------SUBMISSION SAMPLE----------------
Id Weekly_Sales
0 1_1_2012-11-02 0
1 1_1_2012-11-09 0
2 1_1_2012-11-16 0
3 1_1_2012-11-23 0
4 1_1_2012-11-30 0
We can see that the test dataset does not contain all the features included in the train dataset. Since these features (Temperature, Fuel_Price, the MarkDowns, CPI and Unemployment) depend strongly on the date, they cannot be carried over to the test dataset as-is, so it may be a good idea to drop them. Before doing so, however, we will verify that these features do not provide useful information about the target 'Weekly_Sales'.
# Report the (rows, columns) shape of each dataframe.
features.shape, train.shape, stores.shape, test.shape, sample_submission.shape
((8190, 12), (420212, 20), (45, 3), (115064, 19), (115064, 2))
# Print the column dtypes of every dataframe, one banner per table.
for banner, frame in (
    ("<-----------------------------FEATURES--------------------->", features),
    ("<----------------------------TRAIN DATA--------------------->", train),
    ("<-----------------------------STORE DATA-------------------->", stores),
    ("<-------------------------------TEST DATA------------------->", test),
):
    print(banner + "\n")
    print(frame.dtypes)
<-----------------------------FEATURES---------------------> Store int64 Date object Temperature float64 Fuel_Price float64 MarkDown1 float64 MarkDown2 float64 MarkDown3 float64 MarkDown4 float64 MarkDown5 float64 CPI float64 Unemployment float64 IsHoliday bool dtype: object <----------------------------TRAIN DATA---------------------> Store int64 Dept int64 Date object Weekly_Sales float64 IsHoliday bool dtype: object <-----------------------------STORE DATA--------------------> Store int64 Type object Size int64 dtype: object <-------------------------------TEST DATA-------------------> Store int64 Dept int64 Date object IsHoliday bool dtype: object
Let's start by cleaning the data of both datasets. We will check whether they contain missing values or duplicates and eliminate them if that is the case.
It is very important to keep in mind that the datasets are going to be merged. Therefore, they must share a key column with consistent values; we will also verify that the values are consistent across both datasets.
# Attach store metadata to the features table, then join that combined
# table onto train and test via the shared (Store, Date, IsHoliday) keys.
feature_store = pd.merge(features, stores, how='inner', on="Store")
train = pd.merge(train, feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
test = pd.merge(test, feature_store, how='inner', on=['Store', 'Date', 'IsHoliday'])
Another useful step is to facilitate access to the 'Date' attribute by splitting it into its components (i.e. Year, Month, Week and Day).
def _expand_date_features(df):
    """Return a copy of *df* with 'Date' parsed and split into calendar parts.

    Adds Year/Month/Week/Day integer columns and encodes the store
    Type letters A/B/C as 1/2/3 (no other column contains those strings).
    """
    df = df.copy()
    # Parse once and reuse — the original re-parsed the column for every part.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    # Series.dt.week was deprecated and removed in pandas 2.0;
    # isocalendar().week is the supported replacement (cast from UInt32 to int
    # to keep the original int64 dtype).
    df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
    df['Day'] = df['Date'].dt.day
    df.replace({'A': 1, 'B': 2, 'C': 3}, inplace=True)
    return df

train = _expand_date_features(train)
test = _expand_date_features(test)
# Preview the merged train/test frames with their date-part columns.
for caption, frame in (
    ("-------------------------TRAIN DATA------------------------\n", train),
    ("\n\n<---------------------TEST DATA------------------------->\n", test),
):
    print(caption)
    print(frame.head())
-------------------------TRAIN DATA------------------------
Store Dept Date Weekly_Sales IsHoliday Temperature Fuel_Price \
0 1 1 2010-02-05 24924.50 False 42.31 2.572
1 1 2 2010-02-05 50605.27 False 42.31 2.572
2 1 3 2010-02-05 13740.12 False 42.31 2.572
3 1 4 2010-02-05 39954.04 False 42.31 2.572
4 1 5 2010-02-05 32229.38 False 42.31 2.572
MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI \
0 NaN NaN NaN NaN NaN 211.096358
1 NaN NaN NaN NaN NaN 211.096358
2 NaN NaN NaN NaN NaN 211.096358
3 NaN NaN NaN NaN NaN 211.096358
4 NaN NaN NaN NaN NaN 211.096358
Unemployment Type Size Year Month Week Day
0 8.106 1 151315 2010 2 5 5
1 8.106 1 151315 2010 2 5 5
2 8.106 1 151315 2010 2 5 5
3 8.106 1 151315 2010 2 5 5
4 8.106 1 151315 2010 2 5 5
<---------------------TEST DATA------------------------->
Store Dept Date IsHoliday Temperature Fuel_Price MarkDown1 \
0 1 1 2012-11-02 False 55.32 3.386 6766.44
1 1 2 2012-11-02 False 55.32 3.386 6766.44
2 1 3 2012-11-02 False 55.32 3.386 6766.44
3 1 4 2012-11-02 False 55.32 3.386 6766.44
4 1 5 2012-11-02 False 55.32 3.386 6766.44
MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Type \
0 5147.7 50.82 3639.9 2737.42 223.462779 6.573 1
1 5147.7 50.82 3639.9 2737.42 223.462779 6.573 1
2 5147.7 50.82 3639.9 2737.42 223.462779 6.573 1
3 5147.7 50.82 3639.9 2737.42 223.462779 6.573 1
4 5147.7 50.82 3639.9 2737.42 223.462779 6.573 1
Size Year Month Week Day
0 151315 2012 11 44 2
1 151315 2012 11 44 2
2 151315 2012 11 44 2
3 151315 2012 11 44 2
4 151315 2012 11 44 2
train.columns
Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Temperature',
'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Year', 'Month',
'Week', 'Day'],
dtype='object')
train.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Store | 421570.0 | 22.200546 | 12.785297 | 1.000 | 11.000000 | 22.00000 | 33.000000 | 45.000000 |
| Dept | 421570.0 | 44.260317 | 30.492054 | 1.000 | 18.000000 | 37.00000 | 74.000000 | 99.000000 |
| Weekly_Sales | 421570.0 | 15981.258123 | 22711.183519 | -4988.940 | 2079.650000 | 7612.03000 | 20205.852500 | 693099.360000 |
| Temperature | 421570.0 | 60.090059 | 18.447931 | -2.060 | 46.680000 | 62.09000 | 74.280000 | 100.140000 |
| Fuel_Price | 421570.0 | 3.361027 | 0.458515 | 2.472 | 2.933000 | 3.45200 | 3.738000 | 4.468000 |
| MarkDown1 | 150681.0 | 7246.420196 | 8291.221345 | 0.270 | 2240.270000 | 5347.45000 | 9210.900000 | 88646.760000 |
| MarkDown2 | 111248.0 | 3334.628621 | 9475.357325 | -265.760 | 41.600000 | 192.00000 | 1926.940000 | 104519.540000 |
| MarkDown3 | 137091.0 | 1439.421384 | 9623.078290 | -29.100 | 5.080000 | 24.60000 | 103.990000 | 141630.610000 |
| MarkDown4 | 134967.0 | 3383.168256 | 6292.384031 | 0.220 | 504.220000 | 1481.31000 | 3595.040000 | 67474.850000 |
| MarkDown5 | 151432.0 | 4628.975079 | 5962.887455 | 135.160 | 1878.440000 | 3359.45000 | 5563.800000 | 108519.280000 |
| CPI | 421570.0 | 171.201947 | 39.159276 | 126.064 | 132.022667 | 182.31878 | 212.416993 | 227.232807 |
| Unemployment | 421570.0 | 7.960289 | 1.863296 | 3.879 | 6.891000 | 7.86600 | 8.572000 | 14.313000 |
| Type | 421570.0 | 1.589912 | 0.666337 | 1.000 | 1.000000 | 1.00000 | 2.000000 | 3.000000 |
| Size | 421570.0 | 136727.915739 | 60980.583328 | 34875.000 | 93638.000000 | 140167.00000 | 202505.000000 | 219622.000000 |
| Year | 421570.0 | 2010.968591 | 0.796876 | 2010.000 | 2010.000000 | 2011.00000 | 2012.000000 | 2012.000000 |
| Month | 421570.0 | 6.449510 | 3.243217 | 1.000 | 4.000000 | 6.00000 | 9.000000 | 12.000000 |
| Week | 421570.0 | 25.826762 | 14.151887 | 1.000 | 14.000000 | 26.00000 | 38.000000 | 52.000000 |
| Day | 421570.0 | 15.673131 | 8.753549 | 1.000 | 8.000000 | 16.00000 | 23.000000 | 31.000000 |
There are missing values for the MarkDown columns as mentioned previously. Approximately 70% of values are missing in each MarkDown column.
The plot makes the right skewness clear, so most weeks have sales around the median. Also, we can see that the Weekly_Sales attribute has a large kurtosis which indicates the presence of extreme values, in other words, some weeks have high sales. It would be a good idea to know the origins of these extreme values.
# Mean/median Weekly_Sales aggregated by (Year, Week) overall and per year.
weekly_sales = train.groupby(['Year', 'Week']).agg({'Weekly_Sales': ['mean', 'median']})
weekly_sales2010 = train.loc[train['Year'] == 2010].groupby(['Week']).agg({'Weekly_Sales': ['mean', 'median']})
weekly_sales2011 = train.loc[train['Year'] == 2011].groupby(['Week']).agg({'Weekly_Sales': ['mean', 'median']})
weekly_sales2012 = train.loc[train['Year'] == 2012].groupby(['Week']).agg({'Weekly_Sales': ['mean', 'median']})

plt.figure(figsize=(18, 5))
# seaborn >= 0.12 removed positional x/y arguments for lineplot;
# they must be passed as keywords.
for yearly in (weekly_sales2010, weekly_sales2011, weekly_sales2012):
    mean_series = yearly['Weekly_Sales']['mean']
    sns.lineplot(x=mean_series.index, y=mean_series.values)
plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.ylabel("Sales")
plt.title("YEAR-WISE WEEKLY SALES ")
plt.legend(['2010', '2011', '2012'])
plt.show()
Inference :
Here the target variable Weekly_Sales is numerical in nature, so we can use the Pearson correlation coefficient or Spearman's rank correlation to measure the correlation between variables.
# Pearson correlation between features; numeric_only=True is required because
# DataFrame.corr() raises on non-numeric columns (here, 'Date') in pandas >= 2.0.
corr = train.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(15, 10))
# Annotate each cell with its coefficient for easy reading.
corr_plot = sns.heatmap(corr, annot=True)
plt.show()
Inference:
# Partition the merged columns into numeric vs. object-dtype (categorical) lists.
numeric_cols = list(train.select_dtypes(include=np.number).columns)
categorical_cols = list(train.select_dtypes('object').columns)
print("<-----------------NUMERICAL COLUMNS------------------------------------->\n")
print(numeric_cols)
print("\n<------------------CATEGORICAL COLUMNS----------------------------------->\n")
print(categorical_cols)
<-----------------NUMERICAL COLUMNS-------------------------------------> ['Store', 'Dept', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Year', 'Month', 'Week', 'Day'] <------------------CATEGORICAL COLUMNS-----------------------------------> []
# Count missing values per column in the merged train dataframe
# (only the five MarkDown columns are expected to have nulls).
train.isnull().sum()
Store 0 Dept 0 Date 0 Weekly_Sales 0 IsHoliday 0 Temperature 0 Fuel_Price 0 MarkDown1 270889 MarkDown2 310322 MarkDown3 284479 MarkDown4 286603 MarkDown5 270138 CPI 0 Unemployment 0 Type 0 Size 0 Year 0 Month 0 Week 0 Day 0 dtype: int64
# Count missing values per column in the merged test dataframe
# (MarkDowns plus CPI/Unemployment have nulls here).
test.isnull().sum()
Store 0 Dept 0 Date 0 IsHoliday 0 Temperature 0 Fuel_Price 0 MarkDown1 149 MarkDown2 28627 MarkDown3 9829 MarkDown4 12888 MarkDown5 0 CPI 38162 Unemployment 38162 Type 0 Size 0 Year 0 Month 0 Week 0 Day 0 dtype: int64
# Create a mean imputer for the numeric columns (the MarkDown features
# are ~70% missing in train). np.NaN was removed in NumPy 2.0; np.nan is
# the canonical spelling and is also SimpleImputer's default.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the per-column means from the training data's numeric columns.
imputer.fit(train[numeric_cols])
SimpleImputer()
# Overwrite the numeric columns with their mean-imputed values,
# then re-count nulls to confirm nothing is left missing.
train[numeric_cols] = imputer.transform(train[numeric_cols])
train.isnull().sum()
Store 0 Dept 0 Date 0 Weekly_Sales 0 IsHoliday 0 Temperature 0 Fuel_Price 0 MarkDown1 0 MarkDown2 0 MarkDown3 0 MarkDown4 0 MarkDown5 0 CPI 0 Unemployment 0 Type 0 Size 0 Year 0 Month 0 Week 0 Day 0 dtype: int64
#!pip install ydata-profiling
# pandas_profiling was renamed to ydata_profiling; try the new package first
# and fall back to the legacy name so older environments keep working.
try:
    import ydata_profiling as pp
except ImportError:
    import pandas_profiling as pp
# Build an interactive HTML-style profiling report of the cleaned training data.
profile = pp.ProfileReport(train)
profile